import pandas as pdimport plotly.express as pximport plotly.io as piofrom pyspark.sql import SparkSessionimport reimport numpy as npimport plotly.graph_objects as gofrom pyspark.sql.functions import col, split, explode, regexp_replace, transform, whenfrom pyspark.sql import functions as Ffrom pyspark.sql.functions import col, monotonically_increasing_idnp.random.seed(42)pio.renderers.default ="notebook"# Initialize Spark Sessionspark = SparkSession.builder.appName("LightcastData").getOrCreate()# Load Datadf = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("./data/lightcast_job_postings.csv")df.createOrReplaceTempView("job_postings")# Show Schema and Sample Data#print("---This is Diagnostic check, No need to print it in the final doc---")#df.printSchema() # comment this line when rendering the submission#df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/17 20:42:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 0:> (0 + 1) / 1] [Stage 1:> (0 + 1) / 1] 25/10/17 20:43:05 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
2 Data Cleaning
from pyspark.sql.functions import coldf = df.withColumn("SALARY", col("SALARY").cast("float"))df = df.withColumn("SALARY_FROM", col("SALARY_FROM").cast("float"))df = df.withColumn("SALARY_TO", col("SALARY_TO").cast("float"))df = df.withColumn("MIN_YEARS_EXPERIENCE", col("MIN_YEARS_EXPERIENCE").cast("float"))df = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))# Compute median salarymedian_from = df.approxQuantile("SALARY_FROM", [0.5], 0.01)[0]median_to = df.approxQuantile("SALARY_TO", [0.5], 0.01)[0]median_salary = df.approxQuantile("SALARY", [0.5], 0.01)[0]print("Medians:",median_from, median_to, median_salary)# Impute missing 'SALARY_FROM' and 'SALARY_TO' with their mediansdf = df.fillna({"SALARY_FROM": median_from,"SALARY_TO": median_to,"SALARY": median_salary})# Compute 'AVERAGE_SALARY'df = df.withColumn("AVERAGE_SALARY", (col("SALARY_FROM") + col("SALARY_TO")) /2)# Impute missing 'SALARY' with AVERAGE_SALARY, and if that's missing, with the median salaryfrom pyspark.sql.functions import whendf = df.withColumn("SALARY", when( col("SALARY").isNull(), when(col("AVERAGE_SALARY").isNotNull(), col("AVERAGE_SALARY")) .otherwise(median_salary) ).otherwise(col("SALARY")))from pyspark.sql.functions import regexp_replacedf = df.withColumn("EDUCATION_LEVELS_NAME", regexp_replace(col("EDUCATION_LEVELS_NAME"), r'[\n\r]', ''))# Overwritedf.write.option("header", True).mode("overwrite").csv("data/lightcast_job_postings_cleaned.csv")# Display row countprint(f"Rows retained after cleaning: {df.count()}")
Salaries vary widely between industries, with sectors like Information and Finance & Insurance generally showing higher salary ranges than industries such as Accommodation and Food Services. Full-time positions tend to have higher median salaries across most industries compared to part-time or other employment types.
4 Salary Analysis by ONET Occupation Type (Bubble Chart)
# Lot Occupation Namesalary_analysis = spark.sql(""" SELECT LOT_OCCUPATION_NAME AS OCCUPATION_NAME, PERCENTILE(SALARY, 0.5) AS Median_Salary, COUNT(*) AS Job_Postings FROM job_postings GROUP BY LOT_OCCUPATION_NAME ORDER BY Job_Postings DESC LIMIT 10""")salary_pd = salary_analysis.toPandas()import plotly.express as pxfig = px.scatter( salary_pd, x="OCCUPATION_NAME", y="Median_Salary", size="Job_Postings", title="Salary Analysis by LOT Occupation Type (Bubble Chart)", labels={"OCCUPATION_NAME": "LOT Occupation","Median_Salary": "Median Salary","Job_Postings": "Number of Job Postings" }, hover_name="OCCUPATION_NAME", size_max=60, width=1000, height=600, color="Job_Postings", color_continuous_scale="Viridis",)# Layout Customizationfig.update_layout( font_family="Arial", font_size=14, title_font_size=25, xaxis_title="LOT Occupation", yaxis_title="Median Salary", plot_bgcolor="#f6f9fa", xaxis=dict( tickangle=-45, showline=True, linecolor="#444" ), yaxis=dict( showline=True, linecolor="#444" ), xaxis_title_font=dict(size=17), yaxis_title_font=dict(size=17),)fig.show()